In [1]:
from math import e
Calculate psi
In [2]:
psi = lambda x, f: x * (1 - e ** (-f))
xs = [1, 2, 3]
fs = [0.9, 0.5, 0.6]
In [3]:
print "If a query arrives that is bidded on by A and B"
for i in [0, 1]:
print psi(xs[i], fs[i])
print "If a query arrives that is bidded on by A and C"
for i in [0, 2]:
print psi(xs[i], fs[i])
print "If a query arrives that is bidded on by A and B and C"
for i in [0, 1, 2]:
print psi(xs[i], fs[i])
In [4]:
import math
Flagolet-Martin algorithm
In [5]:
# return 3x + 7
def hash(x):
return (3 * x + 7) % 11
In [6]:
for i in range(1, 11):
print bin(hash(i))[2:]
In [7]:
# asymptotic
fp = lambda k, m, n: (1.0 - math.e ** (-1.0 * k * m / n)) ** k
print fp(2, 3, 10)
In [8]:
# tiny bloom filter
fp = lambda k, m, n: (1.0 - (1.0 - (1.0 / n)) ** (k * m)) ** k
print fp(2, 3, 10)
In [9]:
# Decimal number to binary number
def dec2bin(dec):
return bin(dec)[2:]
In [10]:
# Count the number of trailing zeros
def counttrailingzero(b):
cnt = 0
for i in range(len(b))[::-1]:
if b[i] == '0':
cnt += 1
else:
return cnt
return cnt
In [11]:
# Given R = max r(a), estimate number of distinct elements
def distinctelements(r):
return 2 ** r
In [12]:
print counttrailingzero(dec2bin(10))
print counttrailingzero(dec2bin(12))
print counttrailingzero(dec2bin(4))
print counttrailingzero(dec2bin(16))
print counttrailingzero(dec2bin(1))
In [13]:
import numpy as np
from copy import deepcopy
Generate adjacency list from nodes and edges
>>> nodes = ['yahoo', 'amazon', 'microsoft']
>>> edges = [('yahoo', 'yahoo'), ('yahoo', 'microsoft'), ('yahoo', 'amazon'), ('amazon', 'microsoft'), ('amazon', 'yahoo'), ('microsoft', 'amazon')]
>>> getAdjList(nodes, edges)
{0: [0, 2, 1], 1: [2, 0], 2: [1]}
In [14]:
def getAdjList(nodes, edges):
nodeMap = {nodes[i] : i for i in range(len(nodes))}
adjList = {i : [] for i in range(len(nodes))}
for u, v in edges:
adjList[nodeMap[u]].append(nodeMap[v])
return adjList
Genereate A from adjacency list
>>> adjList = {0: [0, 2, 1], 1: [2, 0], 2: [1]}
>>> A = getA(adjList)
In [15]:
def getA(adjList):
N = len(adjList)
A = np.zeros([N, N])
for u in adjList:
vs = adjList[u]
for v in vs:
A[u, v] = 1
return A
Hits algorithm
>>> adjList = {0: [0, 2, 1], 1: [2, 0], 2: [1]}
>>> A = getA(adjList)
>>> hits(A)
(array([ 0.78875329, 0.57713655, 0.21161674]), array([ 0.62790075, 0.45987097, 0.62790075]))
In [16]:
def hits(A, epsilon=10**-6, numiter=1000):
# initialize
AT = A.T
N = len(A)
aold = np.ones(N) * 1.0 / np.sqrt(N)
hold = np.ones(N) * 1.0 / np.sqrt(N)
for i in range(numiter):
hnew = A.dot(aold)
anew = AT.dot(hnew)
hnew *= np.sqrt(1.0 / sum([v * v for v in hnew]))
anew *= np.sqrt(1.0 / sum([v * v for v in anew]))
if np.sum([v * v for v in anew - aold]) < epsilon or \
np.sum([v * v for v in hnew - hold]) < epsilon:
break
hold = hnew
aold = anew
return hnew, anew
In [17]:
def main():
adjList = {0: [0, 2, 1], 1: [2, 0], 2: [1]}
A = getA(adjList)
print hits(A)
In [18]:
if __name__ == '__main__':
main()
import doctest
doctest.testmod()
In [19]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
Helper functions
In [20]:
andor = lambda x, r, b: 1 - (1 - x ** r) ** b
orand = lambda x, r, b: (1 - (1 - x) ** b) ** r
cascade = lambda x, r, b: orand(andor(x, r, b), r, b)
In [21]:
print andor(0.2, 3, 4)
In [27]:
def plot():
# Variable Initialization
k = 2
r = k ** 2
b = k ** 2
# AND-OR Construction
x1 = np.arange(0, 1, 0.01)
y1 = andor(x1, r, b)
# OR-AND Construction
x2 = np.arange(0, 1, 0.01)
y2 = orand(x2, r, b)
# Cascade Construction
x3 = np.arange(0, 1, 0.01)
y3 = cascade(x3, k, k)
# Show plot
plt.plot(x1, y1, '-r', x2, y2, '-g', x3, y3, '-b')
plt.grid(True)
plt.legend(('and-or', 'or-and', 'cascade'))
#plt.savefig('lsh.pdf')
In [28]:
plot()
In [29]:
import numpy as np
In [30]:
class MinHashing:
def __init__(self, mat, hashfunc):
self.matrix = mat
self.m = len(mat)
self.n = len(mat[0])
self.hashfunc = hashfunc
self.k = len(self.hashfunc)
def minhash(self):
self.sig = np.ones((self.k, self.n)) * 2 ** 10
for j in range(self.m):
for c in range(self.n):
if self.matrix[j][c] == 1:
for i in range(self.k):
if self.hashfunc[i](j) < self.sig[i][c]:
self.sig[i][c] = self.hashfunc[i](j)
def show(self):
print self.sig
In [31]:
if __name__ == '__main__':
hashfunc = [lambda x: (3 * x + 2) % 7, lambda x: (x - 1) % 7]
mat = [[0,1],[1,0],[0,1],[0,0],[1,1],[1,1],[1,0]]
mh = MinHashing(mat, hashfunc)
mh.minhash()
mh.show()
# 2009 final
mat = [[0,0,1],[1,1,1],[0,1,1],[1,0,0],[0,1,0]]
hashfunc = [lambda x: x + 1, lambda x: (x - 1) % 5 + 1, lambda x: (x - 2) % 5 + 1, lambda x : (x - 3) % 5 + 1, lambda x: (x - 4) % 5 + 1]
mh = MinHashing(mat, hashfunc)
mh.minhash()
mh.show()
In [32]:
import numpy as np
from copy import deepcopy
Generate adjacency list from nodes and edges
In [33]:
def getAdjList(nodes, edges):
nodeMap = {nodes[i] : i for i in range(len(nodes))}
adjList = {i : [] for i in range(len(nodes))}
for u, v in edges:
adjList[nodeMap[u]].append(nodeMap[v])
return adjList
Generate M from adjacency list
In [34]:
def getM(adjList):
size = len(adjList)
M = np.zeros([size, size])
for u in adjList:
vs = adjList[u]
n = len(vs)
for v in vs:
M[v, u] = 1.0 / n
return M
PageRank
nodes = ['y', 'a', 'm']
edges = [('y', 'y'), ('a', 'm'), ('m', 'm'), ('a', 'y'), ('y', 'a')]
adjList = getAdjList(nodes, edges)
M = getM(adjList)
print pageRank(M, 0.8)
In [35]:
def pageRank(M, beta=0.8, epsilon=10**-6, numiter=1000):
N = len(M)
const = np.ones(N) * (1 - beta) / N
rold = np.ones(N) * 1.0 / N
rnew = np.zeros(N)
for i in range(numiter):
rnew = beta * M.dot(rold)
rnew += const
if np.sum(np.abs(rold - rnew)) < epsilon:
break
rold = rnew
return rnew
Topic-specific pagerank
>>> nodes = [1, 2, 3, 4]
>>> edges = [(1, 2), (1, 3), (3, 4), (4, 3), (2, 1)]
>>> adjList = getAdjList(nodes, edges)
>>> M = getM(adjList)
>>> print topicSpecific(M, {0: 1}, 0.8)
>>> print topicSpecific(M, {0: 1}, 0.9)
>>> print topicSpecific(M, {0: 1}, 0.7)
In [36]:
def topicSpecific(M, S, beta=0.8, epsilon=10**-6, numiter=1000):
N = len(M)
rold = np.ones(N) * 1.0 / N
const = np.zeros(N)
for i in S:
const[i] = (1 - beta) * S[i]
for i in range(numiter):
rnew = M.dot(rold) * beta
rnew += const
if np.sum(np.abs(rold - rnew)) < epsilon:
break
rold = rnew
return rnew
In [37]:
def main():
nodes = ['y', 'a', 'm']
edges = [('y', 'y'), ('a', 'm'), ('m', 'm'), ('a', 'y'), ('y', 'a')]
adjList = getAdjList(nodes, edges)
M = getM(adjList)
print pageRank(M, 0.8)
In [38]:
def test():
nodes = [1,2,3,4,5,6]
edges = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 1), (2, 1), (2, 4), (2, 6), (2, 3), (2, 5), (3, 2), (3, 6), (3, 5), (4, 1), (4, 6), (5, 2), (5, 3), (6, 1)]
adjList = getAdjList(nodes, edges)
M = getM(adjList)
print M
print pageRank(M, 0.8)
In [39]:
if __name__ == '__main__':
test()
In [40]:
from numpy import dot
In [41]:
def rh(x, v):
return 1 if dot(x, v) >= 0 else -1
In [42]:
a = [1, 0, -2, 1, -3, 0, 0]
b = [2, 0, -3, 0, -2, 0, 2]
c = [1, -1, 0, 1, 2, -2, 1]
x = [1, 1, 1, 1, 1, 1, 1]
y = [-1, 1, -1, 1, -1, 1, -1]
z = [1, 1, 1, -1, -1, -1, -1]
In [43]:
print "a"
print rh(a, x)
print rh(a, y)
print rh(a, z)
In [44]:
print "b"
print rh(b, x)
print rh(b, y)
print rh(b, z)
In [45]:
print "c"
print rh(c, x)
print rh(c, y)
print rh(c, z)
Estimate angles
In [46]:
from scipy.spatial.distance import cosine
from numpy import arccos
from numpy import pi
In [47]:
a = [-1, 1, 1]
b = [-1, 1, -1]
c = [1, -1, -1]
In [48]:
print arccos(1 - cosine(a, b)) / pi * 180
print arccos(1 - cosine(b, c)) / pi * 180
print arccos(1 - cosine(c, a)) / pi * 180
In [49]:
from scipy.spatial.distance import jaccard
from scipy.spatial.distance import cosine
In [50]:
def jaccard_sim(u, v):
return 1 - jaccard(u, v)
In [51]:
def cosine_sim(u, v):
return 1 - cosine(u, v)
In [52]:
print jaccard_sim([1,0,0,1,1], [0,1,1,1,0])
In [53]:
print cosine_sim([1,0,1], [0,1,1])
In [54]:
print cosine_sim([5.22, 1.42], [4.06, 6.39])
In [ ]: